1 Introduction to the case study

This Case of study explores and analyses data sets of restaurants reservations and activities for two distant touristic area in Japan. The data set covers information of two years. The goal of this study is to predict restaurant activities depending of dataset history. The challenge is to decipher knowledge about the behaviors of the visitors in two distant areas depending in multiple parameter like weekend, holidays, whether, gender, kind of dishes and times. The data set comes from kaggle competition named Recruit restaurant visitor forecasting. This tutorial is inspired from the winner of this competition which is Heads or Tails. All respect and thanks for his work and data exploratory and analysis. It could be considered as a lesson for bigger on data analysis using R programming language.

1.1 Glimpse of the data sets

The data comes in the shape of 8 relational files which are derived from two separate Japanese websites that collect user information: “Hot Pepper Gourmet (hpg): similar to Yelp” (search and reserve) and “AirREGI / Restaurant Board (air): similar to Square” (reservation control and cash register). The training data is based on the time range of Jan 2016 - most of Apr 2017, while the test set includes the last week of Apr plus May 2017." The test data intentionally spans a holiday week in Japan called the ‘Golden Week.’ The data description further notes that:”There are days in the test set where the restaurant were closed and had no visitors. These are ignored in scoring. The training set omits days where the restaurants were closed."

air_visit_data.csv: N° visitors per store and per Date Air visit

date_info.csv: Essentially flags the Japanese holidays. data info

air_reserve.csv: Reservations and Visitors made through the air. Air reserve

hpg_reserve.csv: Reservations and Visitors for hpg location. hpg reserve

air_store_info.csv: details about the air sotres including genre and location. Air store info

hpg_store_info.csv: details about the hpg stores including genre and location. hpg store info

store_id_relation.csv: connects the air and hpg ids Stor ids relation

sample_submission.csv: serves as the test set. The id is formed by combining the air id with the visit date.

Sample submission

Sample submission

1.2 Load librairies

1.3 Load data

air_visit <- as_tibble(fread(str_c("air_visit_data.csv")))
air_reserve <- as_tibble(fread(str_c('air_reserve.csv'))) 
air_store_info <- as_tibble(fread(str_c('air_store_info.csv')))
date_info <- as_tibble(fread(str_c('date_info.csv')))
hpg_reserve <- as_tibble(fread(str_c('hpg_reserve.csv')))
hpg_store_info <- as_tibble((fread(str_c('hpg_store_info.csv'))))
sample_submission <- as_tibble(fread(str_c('sample_submission.csv')))
store_id_relation <- as_tibble(fread(str_c('store_id_relation.csv')))

1.4 multiplot function

# Define multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

2 Overview: File structure and content

2.1 Air visits

There are two mains functions to summarize a dataframe or tibble:

  • summary is useful if with have data frame with numeric value. It computes and display the Min, Max, Media, mean, and quaters of each column.

  • glimpse commes from tibble package. It displays the summary of the dataset in other way.

summary(air_visit)
##  air_store_id        visit_date           visitors     
##  Length:252108      Length:252108      Min.   :  1.00  
##  Class :character   Class :character   1st Qu.:  9.00  
##  Mode  :character   Mode  :character   Median : 17.00  
##                                        Mean   : 20.97  
##                                        3rd Qu.: 29.00  
##                                        Max.   :877.00
# verify if is there a missing value
all(is.na(air_visit))
## [1] FALSE
tibble::glimpse(air_visit)
## Observations: 252,108
## Variables: 3
## $ air_store_id <chr> "air_ba937bf13d40fb24", "air_ba937bf13d40fb24", "ai…
## $ visit_date   <chr> "2016-01-13", "2016-01-14", "2016-01-15", "2016-01-…
## $ visitors     <int> 25, 32, 29, 22, 6, 9, 31, 21, 18, 26, 21, 11, 24, 2…
# Print the distinct number of stores in air place 
air_visit %>% dplyr::distinct(air_store_id) %>% nrow()
## [1] 829
# Print  the N° of distinct datetime for air place
air_visit %>% dplyr::distinct(visit_date) %>% nrow()
## [1] 478

2.2 Air Reserve

summary(air_reserve)
##  air_store_id       visit_datetime     reserve_datetime  
##  Length:92378       Length:92378       Length:92378      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  reserve_visitors 
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.000  
##  Mean   :  4.482  
##  3rd Qu.:  5.000  
##  Max.   :100.000
# verify if is there a missing value
all(is.na(air_reserve))
## [1] FALSE
glimpse(air_reserve)
## Observations: 92,378
## Variables: 4
## $ air_store_id     <chr> "air_877f79706adbfb06", "air_db4b38ebe7a7ceff",…
## $ visit_datetime   <chr> "2016-01-01 19:00:00", "2016-01-01 19:00:00", "…
## $ reserve_datetime <chr> "2016-01-01 16:00:00", "2016-01-01 19:00:00", "…
## $ reserve_visitors <int> 1, 3, 6, 2, 5, 2, 4, 2, 2, 2, 3, 3, 2, 6, 7, 41…
# print the distinct store in air place that had reservation
air_reserve %>% dplyr::distinct(air_store_id) %>% nrow()
## [1] 314

There are 314 of 829 stores that use reservation process in air space. There are 252,108 reservation for 314 stores

2.3 HPG Reserve

We apply the same code for hpg stores.

summary(hpg_reserve)
##  hpg_store_id       visit_datetime     reserve_datetime  
##  Length:2000320     Length:2000320     Length:2000320    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  reserve_visitors 
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.000  
##  Mean   :  5.074  
##  3rd Qu.:  6.000  
##  Max.   :100.000
# verify if is there a missing value
all(is.na(hpg_reserve))
## [1] FALSE
tibble::glimpse(hpg_reserve)
## Observations: 2,000,320
## Variables: 4
## $ hpg_store_id     <chr> "hpg_c63f6f42e088e50f", "hpg_dac72789163a3f47",…
## $ visit_datetime   <chr> "2016-01-01 11:00:00", "2016-01-01 13:00:00", "…
## $ reserve_datetime <chr> "2016-01-01 09:00:00", "2016-01-01 06:00:00", "…
## $ reserve_visitors <int> 1, 3, 2, 5, 13, 2, 2, 2, 2, 6, 2, 2, 2, 2, 5, 4…
# print the distinct store in hpg space
hpg_reserve %>% dplyr::distinct(hpg_store_id) %>% nrow()
## [1] 13325

There are 2,000,320 reservation for 13325 stores. In hpg space, it seems that all restaurants use reservation process.

2.4 Air Store informations: genre and location

summary(air_store_info)
##  air_store_id       air_genre_name     air_area_name         latitude    
##  Length:829         Length:829         Length:829         Min.   :33.21  
##  Class :character   Class :character   Class :character   1st Qu.:34.70  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.65  
##                                                           3rd Qu.:35.69  
##                                                           Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3
# verify if is there a missing value
all(is.na(air_store_info))
## [1] FALSE
tibble::glimpse(air_store_info)
## Observations: 829
## Variables: 5
## $ air_store_id   <chr> "air_0f0cdeee6c9bf3d7", "air_7cc17a324ae5c7dc", "…
## $ air_genre_name <chr> "Italian/French", "Italian/French", "Italian/Fren…
## $ air_area_name  <chr> "Hyōgo-ken Kōbe-shi Kumoidōri", "Hyōgo-ken Kōbe-s…
## $ latitude       <dbl> 34.69512, 34.69512, 34.69512, 34.69512, 35.65807,…
## $ longitude      <dbl> 135.1979, 135.1979, 135.1979, 135.1979, 139.7516,…
# print the number of air area in air space
air_store_info %>% dplyr::distinct(air_area_name) %>% nrow()
## [1] 103
# print the number of stores in air space
air_store_info %>% dplyr::distinct(air_store_id) %>% nrow()
## [1] 829
# print the number of latitude for stores in air space
air_store_info %>% dplyr::distinct(latitude) %>% nrow()
## [1] 108
# print the number of longitude for stores in air space
air_store_info %>% dplyr::distinct(longitude) %>% nrow()
## [1] 108
# print the number of cuisine genre in air space
air_store_info %>% dplyr::distinct(air_genre_name) %>% nrow()
## [1] 14

2.5 HPG Stores informations: genre and locations

summary(hpg_store_info)
##  hpg_store_id       hpg_genre_name     hpg_area_name         latitude    
##  Length:4690        Length:4690        Length:4690        Min.   :33.31  
##  Class :character   Class :character   Class :character   1st Qu.:34.69  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.81  
##                                                           3rd Qu.:35.70  
##                                                           Max.   :43.77  
##    longitude    
##  Min.   :130.3  
##  1st Qu.:135.5  
##  Median :139.5  
##  Mean   :137.7  
##  3rd Qu.:139.7  
##  Max.   :143.7
# verify if is there a missing value
all(is.na(hpg_store_info))
## [1] FALSE
tibble::glimpse(hpg_store_info)
## Observations: 4,690
## Variables: 5
## $ hpg_store_id   <chr> "hpg_6622b62385aec8bf", "hpg_e9e068dd49c5fa00", "…
## $ hpg_genre_name <chr> "Japanese style", "Japanese style", "Japanese sty…
## $ hpg_area_name  <chr> "Tōkyō-to Setagaya-ku Taishidō", "Tōkyō-to Setaga…
## $ latitude       <dbl> 35.64367, 35.64367, 35.64367, 35.64367, 35.64367,…
## $ longitude      <dbl> 139.6682, 139.6682, 139.6682, 139.6682, 139.6682,…
# print le number of latitudes for hpg stores
hpg_store_info %>% dplyr::distinct(latitude) %>% nrow()
## [1] 129
# print the number of longitudes for hpg stores
hpg_store_info %>% dplyr::distinct(longitude) %>% nrow()
## [1] 129
# print the number of area names in hpg space
hpg_store_info %>% dplyr::distinct(hpg_area_name) %>% nrow()
## [1] 119
# print the number of genre of cuisine un HPG space
hpg_store_info %>% dplyr::distinct(hpg_genre_name) %>% nrow()
## [1] 34

2.6 Holidayes

glimpse(date_info)
## Observations: 517
## Variables: 3
## $ calendar_date <chr> "2016-01-01", "2016-01-02", "2016-01-03", "2016-01…
## $ day_of_week   <chr> "Friday", "Saturday", "Sunday", "Monday", "Tuesday…
## $ holiday_flg   <int> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
# verify if is there a missing value
all(is.na(date_info))
## [1] FALSE
# print the number of days for all periode of this case study
date_info %>% dplyr::distinct(calendar_date) %>% nrow() %>% days()
## [1] "517d 0H 0M 0S"
# print the number of days with holidays
date_info %>% dplyr::distinct(calendar_date, holiday_flg) %>% filter(holiday_flg == "1") %>% nrow() 
## [1] 35

2.7 Store IDs

glimpse(store_id_relation)
## Observations: 150
## Variables: 2
## $ air_store_id <chr> "air_63b13c56b7201bd9", "air_a24bf50c3e90d583", "ai…
## $ hpg_store_id <chr> "hpg_4bc649e72e2a239a", "hpg_c34b496d0305a809", "hp…

There are 150 relations between Air (829 stores) and HPG (4690 stores) stores.

2.8 Sample Submission

tibble::glimpse(sample_submission)
## Observations: 32,019
## Variables: 2
## $ id       <chr> "air_00a91d42b08b08d9_2017-04-23", "air_00a91d42b08b08d…
## $ visitors <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

the submission file should be two columns. The first one is the concatenation of store_id and date. the 2sd is the number of visitors.

2.9 Missing Values

purrr::map(ls(), is.na)
## [[1]]
## [1] FALSE
## 
## [[2]]
## [1] FALSE
## 
## [[3]]
## [1] FALSE
## 
## [[4]]
## [1] FALSE
## 
## [[5]]
## [1] FALSE
## 
## [[6]]
## [1] FALSE
## 
## [[7]]
## [1] FALSE
## 
## [[8]]
## [1] FALSE
## 
## [[9]]
## [1] FALSE

2.10 Reformating dataset

# convert date from chracter to date type
air_visit <- air_visit %>% mutate(visit_date = ymd(visit_date))

# convert date_time from character to date_time type
air_reserve <- air_reserve %>% mutate(visit_datetime = ymd_hms(visit_datetime),
                                      reserve_datetime = ymd_hms(reserve_datetime))

# convert date_time from character to date_time type
hpg_reserve <- hpg_reserve %>% mutate(visit_datetime = ymd_hms(visit_datetime),
                                      reserve_datetime = ymd_hms(reserve_datetime))

# convert genre and area name as factor
air_store_info <- air_store_info %>% mutate(air_genre_name = as.factor(air_genre_name),
                                            air_area_name = as.factor(air_area_name))

# convert genre and area name as factor
hpg_store_info <- hpg_store_info %>% mutate(hpg_genre_name = as.factor(hpg_genre_name),
                                            hpg_area_name = as.factor(hpg_area_name))

# convert holiday flag as logical type
date_info <- date_info %>% mutate(holiday_flg = as.logical(holiday_flg),
                                  date = ymd(calendar_date))

3 Individual feature visualisations

Here we have a first look at the distribution of the feature in our individual data files before combining them for a more detailed analysis.

3.1 Air Visits space

# distribution of the number of visits per date for air space
 p1 <- air_visit %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(visitors)) %>%
  ggplot(aes(visit_date, all_visitors)) +
  geom_line(col = "blue") +
  labs(y = "All visitors", x = "Date")

# plot the density of the nulber of visitors per day (with log)
# 20 seems tobe the must frequent sum of visitors per day
 p2 <- air_visit %>%
   ggplot(aes(visitors)) +
   geom_vline(xintercept = 20, color ="red") +
   geom_histogram(fill = "blue", bins = 30) +
   #xlim(0,150)
   scale_x_log10()

 # plot the median of visitors per week day
 p3 <- air_visit %>%
   mutate(wday = wday(visit_date, label = TRUE)) %>%
   group_by(wday) %>%
   summarise(visits = median(visitors)) %>%
   ggplot(aes(wday, visits, fill = wday)) +
   geom_col() +
   theme(legend.position = "none") +
   labs(x = "Day of the week", y = "Median Visitors")

 # plot the median of visitors per months
 p4  <- air_visit %>%
   mutate(Month = month(visit_date, label = TRUE)) %>%
   group_by(Month) %>%
   summarise(visits = median(visitors)) %>%
   ggplot(aes(Month, visits, fill = Month)) +
   geom_col() +
   theme(legend.position = 'none') +
   labs(x = "MOnth", y = "Median Visitors")

 # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,1,1,1,2,3,4,4), 2, 4, byrow = TRUE)
 multiplot(p1, p2, p3, p4, layout=layout)

We will be forecasting for the last week of April plus May 2017, so let’s look at this time range in our 2016 training data:

# chech if is there holidays in period that will be predicted 
holydays <- date_info %>%
  filter(calendar_date > ymd("2016-04-15") & calendar_date < ymd("2016-06-15"), holiday_flg)

# plot the number of visitors per day for the periode from 2016-04-15 to 2016-06-15 wich corresponds to the same periode that we will predict

air_visit %>%
  filter(visit_date > ymd("2016-04-15") & visit_date < ymd("2016-06-15")) %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(visitors)) %>%
  ggplot(aes(visit_date, all_visitors)) +
  geom_line() +
  geom_smooth(method = "loess", color = "blue", span = 1/10) +
  labs(y = "All visitors", x = "Date") +
  geom_vline(xintercept = holydays$date, color ="red") 

There are 3 holidays that match with a global increase of visits in all air space.

3.2 Air Reservations

# split date_time to date, hour and weekday for visits and reservations
foo <- air_reserve %>%
  mutate(reserve_date = date(reserve_datetime),
         reserve_hour = hour(reserve_datetime),
         reserve_wday = wday(reserve_datetime, label = TRUE),
         visit_date = date(visit_datetime),
         visit_hour = hour(visit_datetime),
         visit_wday = wday(visit_datetime, label = TRUE),
         diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
         diff_day = time_length(visit_datetime - reserve_datetime, unit = "day"))

# plot visits per date
  p1 <- foo %>%
  group_by(visit_date) %>%
  summarise(all_vitisors = sum(reserve_visitors)) %>%
  ggplot(aes(visit_date, all_vitisors)) +
  geom_line() +
  labs(x = "Air visit date")

  # plot visits per time (hour)
  p2 <- foo %>%
    group_by(visit_hour) %>%
    summarise(all_visitors = sum(reserve_visitors)) %>%
    ggplot(aes(visit_hour, all_visitors)) +
    geom_col(fill = "blue") +
    labs(x = "Time from reservation to visit [hours]")

  # plot density of reservation delay
  p3 <- foo %>%
    # limite reservation delay to 7 days
    filter(diff_hour < 24 *7) %>%
    group_by(diff_hour) %>%
    summarise(all_visitors = sum(reserve_visitors)) %>%
    ggplot(aes(diff_hour, all_visitors)) +
    geom_col(fill= "blue") +
    labs(x = "Time from reservation (hours)", y = "all_visitors")

  # plot muliple plot
  layout <- matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE)
  multiplot(p1, p2, p3, layout = layout)

3.3 HPG Reservation exploration

# split date_time to date, hour, week day
too <- hpg_reserve %>%
  mutate(visit_date = date(visit_datetime),
         visit_hour = hour(visit_datetime),
         visit_wday = wday(visit_datetime),
         reserve_date = date(reserve_datetime),
         reserve_hour = hour(reserve_datetime),
         reserve_wday = wday(reserve_datetime),
         diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
         diff_day = time_length(visit_datetime - reserve_datetime, unit = "day")
         )

# plot visits per date
p1 <- too %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(reserve_visitors)) %>%
  ggplot(aes(visit_date, all_visitors)) +
  geom_line()

# plot visits per time (hour)
p2 <- too %>%
  group_by(visit_hour) %>%
  summarise(all_visitors = sum(reserve_visitors)) %>%
  ggplot(aes(visit_hour, all_visitors)) +
  geom_col(fill="red")

# plot the density of the delay of reservation
p3 <- too %>%
  group_by(diff_hour) %>%
  summarise(all_visitors = sum(reserve_visitors)) %>%
  ggplot(aes(diff_hour, all_visitors)) +
  geom_col(fill = "red") +
  xlim(0,150)

 layout <- matrix(c(1,1,2,3), 2, 2, byrow = TRUE)
 multiplot(p1, p2, p3, layout = layout)
## Warning: Removed 1978 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_col).

3.4 Air Store info exploration

leaflet(air_store_info) %>%
  addTiles() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addMarkers(~longitude, ~latitude,
             popup = ~air_store_id, label = ~air_genre_name,
             clusterOptions = markerClusterOptions()
             )
## plot the number of restaurant per area name (top 15)
p1 <- air_store_info %>%
  group_by(air_area_name) %>%
  count() %>%
  ungroup() %>%
  top_n(15,n) %>%
  ggplot(aes(reorder(air_area_name, n, FUN = min), n, fill = air_area_name )) +
  geom_col()+
  coord_flip()+
  theme(legend.position = "none") +
  labs(x= "Top 15 area (air_aera_name)", y ="Number restaurants")

# plot the number of the most frequente genre of restaurant
p2 <- air_store_info %>%
  group_by(air_genre_name) %>%
  count() %>%
  #ungroup() %>%
  top_n(15, n) %>%
  ggplot(aes(reorder(air_genre_name, n), n, fill = air_genre_name)) +
  geom_col() +
  coord_flip()+
  theme(legend.position = "none")

layout <- matrix(c(1,2), 2,1, byrow = TRUE)
multiplot(p1,p2, layout = layout)

3.5 HPG Store ifo exploration

# plot stores localisation
leaflet(hpg_store_info) %>%
  addTiles() %>%
  addProviderTiles("cartoDB.positron") %>%
  addMarkers(~longitude, ~latitude,
             popup = ~hpg_store_id, label = ~hpg_genre_name,
             clusterOptions = markerClusterOptions()
             )
# plot the number of store by genre
p1 <- hpg_store_info %>%
  group_by(hpg_genre_name) %>%
  count() %>%
  ggplot(aes(reorder(hpg_genre_name, n, FUN = min), n, fill = hpg_genre_name)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Type of cuisine (hpg_genre_name)", y = "Number of hpg restaurants")

# plot stores per area name
p2 <- hpg_store_info %>%
  group_by(hpg_area_name) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(15) %>%
  ggplot(aes(reorder(x = hpg_area_name, n, FUN = min), n, fill = hpg_area_name)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Top 15 areas (hpg_area_name)", y = "Number of hpg restaurants")

layout <- matrix(c(1,2), 1,2, byrow=TRUE)
multiplot(p1, p2, layout = layout)

3.6 Holidays

# plot the number of holidays
date_info %>%
  group_by(holiday_flg) %>%
  count() %>%
  ggplot(aes(x = holiday_flg, y = n, fill = holiday_flg)) +
  geom_col() +
  theme(legend.position = "none")

# plot days and holidays
date_info %>%
  filter(date  > ymd("2017-04-15") & date < ymd("2017-06-01")) %>%
ggplot(aes(x = date, y= holiday_flg, color = holiday_flg)) +
geom_point() +
  theme(legend.position = "none")

3.7 Map date for training and testing

air_visit_train <- air_visit %>%
  rename(date = visit_date) %>%
  distinct(date) %>%
  mutate(dset= "train")

air_visit_test <- sample_submission %>%
  tidyr::separate(id, c("foo","bar", "date"), sep = "_") %>%
  mutate(date = ymd(date)) %>%
  distinct(date) %>%
  mutate(dset = "test")

air_visit_train <- air_visit_train %>%
                   dplyr::bind_rows(air_visit_test)

year(air_visit_train$date) <- 2017

air_visit_train %>%
  filter(!is.na(date))
## # A tibble: 516 x 2
##    date       dset 
##    <date>     <chr>
##  1 2017-01-13 train
##  2 2017-01-14 train
##  3 2017-01-15 train
##  4 2017-01-16 train
##  5 2017-01-18 train
##  6 2017-01-19 train
##  7 2017-01-20 train
##  8 2017-01-21 train
##  9 2017-01-22 train
## 10 2017-01-23 train
## # … with 506 more rows
  #mutate(year = forcats::fct_relevel(as.factor(year), c("2017", "2016")))